The data consists of vegetation % cover by functional group from across CONUS (from AIM, FIA, LANDFIRE, and RAP), as well as climate variables from DayMet, which have been aggregated into mean interannual conditions accross multiple temporal windows.

Dependencies

User defined parameters

print(params)
## $run
## [1] TRUE
## 
## $test_run
## [1] FALSE
## 
## $save_figs
## [1] FALSE
## 
## $ecoregion
## [1] "shrubGrass"
## 
## $response
## [1] "ShrubCover"
# set to true if want to run for a limited number of rows (i.e. for code testing)
test_run <- params$test_run
save_figs <- params$save_figs
response <- params$response
fit_sample <- TRUE # fit model to a sample of the data
n_train <- 5e4 # sample size of the training data
n_test <- 1e6 # sample size of the testing data (if this is too big the decile dotplot code throws memory errors)


run <- params$run
# set option so resampled dataset created here reproduces earlier runs of this code with dplyr 1.0.10
source("../../../Functions/glmTransformsIterates.R")
source("../../../Functions/transformPreds.R")
source("../../../Functions/StepBeta_mine.R")
#source("src/fig_params.R")
#source("src/modeling_functions.R")
 
library(ggspatial)
library(terra)
library(tidyterra)
library(sf)
library(caret)
library(tidyverse)
library(GGally) # for ggpairs()
library(pdp) # for partial dependence plots
library(gridExtra)
library(knitr)
library(patchwork) # for figure insets etc. 
library(ggtext)
library(StepBeta)
theme_set(theme_classic())
library(here)
library(rsample)
library(kableExtra)
library(glmnet)

read in data

Data compiled in the prepDataForModels.R script

here::i_am("Analysis/VegComposition/ModelFitting/02_ModelFitting.Rmd")
modDat <- readRDS( here("Data_processed", "CoverData", "DataForModels_spatiallyAveraged_withSoils_noSf.rds"))
## there are some values of the annual wet degree days 5th percentile that have -Inf?? change to lowest value for now? 
modDat[is.infinite(modDat$annWetDegDays_5percentile_3yrAnom), "annWetDegDays_5percentile_3yrAnom"] <- -47.8
## same, but for annual water deficit 95th percentile 
modDat[is.infinite(modDat$annWaterDeficit_95percentile_3yrAnom), "annWaterDeficit_95percentile_3yrAnom"] <- -600

# # Convert total cover variables into proportions (for later use in beta regression models) ... proportions are already scaled from zero to 1
# modDat <- modDat %>%
#   mutate(TotalTreeCover = TotalTreeCover/100,
#          CAMCover = CAMCover/100,
#          TotalHerbaceousCover = TotalHerbaceousCover/100,
#          BareGroundCover = BareGroundCover/100,
#          ShrubCover = ShrubCover/100
#          )
# For all response variables, make sure there are no 0s add or subtract .0001 from each, since the Gamma model framework can't handle that
modDat[modDat$TotalTreeCover == 0 & !is.na(modDat$TotalTreeCover), "TotalTreeCover"] <- 0.0001
modDat[modDat$CAMCover == 0 & !is.na(modDat$CAMCover), "CAMCover"] <- 0.0001
modDat[modDat$TotalHerbaceousCover == 0  & !is.na(modDat$TotalHerbaceousCover), "TotalHerbaceousCover"] <- 0.0001
modDat[modDat$BareGroundCover == 0 & !is.na(modDat$BareGroundCover), "BareGroundCover"] <- 0.0001
modDat[modDat$ShrubCover == 0 & !is.na(modDat$ShrubCover), "ShrubCover"] <- 0.0001
modDat[modDat$BroadleavedTreeCover_prop == 0 & !is.na(modDat$BroadleavedTreeCover_prop), "BroadleavedTreeCover_prop"] <- 0.0001
modDat[modDat$NeedleLeavedTreeCover_prop == 0 & !is.na(modDat$NeedleLeavedTreeCover_prop), "NeedleLeavedTreeCover_prop"] <- 0.0001
modDat[modDat$C4Cover_prop == 0 & !is.na(modDat$C4Cover_prop), "C4Cover_prop"] <- 0.0001
modDat[modDat$C3Cover_prop == 0 & !is.na(modDat$C3Cover_prop), "C3Cover_prop"] <- 0.0001
modDat[modDat$ForbCover_prop == 0 & !is.na(modDat$ForbCover_prop), "ForbCover_prop"] <- 0.0001
# 
# modDat[modDat$TotalTreeCover ==1& !is.na(modDat$TotalTreeCover), "TotalTreeCover"] <- 0.999
# modDat[modDat$CAMCover ==1& !is.na(modDat$CAMCover), "CAMCover"] <- 0.999
# modDat[modDat$TotalHerbaceousCover ==1 & !is.na(modDat$TotalHerbaceousCover), "TotalHerbaceousCover"] <- 0.999
# modDat[modDat$BareGroundCover ==1& !is.na(modDat$BareGroundCover), "BareGroundCover"] <- 0.999
# modDat[modDat$ShrubCover ==1& !is.na(modDat$ShrubCover), "ShrubCover"] <- 0.999
# modDat[modDat$BroadleavedTreeCover_prop ==1& !is.na(modDat$BroadleavedTreeCover_prop), "BroadleavedTreeCover_prop"] <- 0.999
# modDat[modDat$NeedleLeavedTreeCover_prop ==1& !is.na(modDat$NeedleLeavedTreeCover_prop), "NeedleLeavedTreeCover_prop"] <- 0.999
# modDat[modDat$C4Cover_prop ==1& !is.na(modDat$C4Cover_prop), "C4Cover_prop"] <- 0.999
# modDat[modDat$C3Cover_prop ==1& !is.na(modDat$C3Cover_prop), "C3Cover_prop"] <- 0.999
# modDat[modDat$ForbCover_prop ==1& !is.na(modDat$ForbCover_prop), "ForbCover_prop"] <- 0.999

Prep data

set.seed(1234)
modDat_1 <- modDat %>% 
  select(-c(prcp_annTotal:annVPD_min)) %>% 
  # mutate(Lon = st_coordinates(.)[,1], 
  #        Lat = st_coordinates(.)[,2])  %>% 
  # st_drop_geometry() %>% 
  # filter(!is.na(newRegion))
  rename("tmin" = tmin_meanAnnAvg_CLIM, 
     "tmax" = tmax_meanAnnAvg_CLIM, #1
     "tmean" = tmean_meanAnnAvg_CLIM, 
     "prcp" = prcp_meanAnnTotal_CLIM, 
     "t_warm" = T_warmestMonth_meanAnnAvg_CLIM,
     "t_cold" = T_coldestMonth_meanAnnAvg_CLIM, 
     "prcp_wet" = precip_wettestMonth_meanAnnAvg_CLIM,
     "prcp_dry" = precip_driestMonth_meanAnnAvg_CLIM, 
     "prcp_seasonality" = precip_Seasonality_meanAnnAvg_CLIM, #2
     "prcpTempCorr" = PrecipTempCorr_meanAnnAvg_CLIM,  #3
     "abvFreezingMonth" = aboveFreezing_month_meanAnnAvg_CLIM, 
     "isothermality" = isothermality_meanAnnAvg_CLIM, #4
     "annWatDef" = annWaterDeficit_meanAnnAvg_CLIM, 
     "annWetDegDays" = annWetDegDays_meanAnnAvg_CLIM,
     "VPD_mean" = annVPD_mean_meanAnnAvg_CLIM, 
     "VPD_max" = annVPD_max_meanAnnAvg_CLIM, #5
     "VPD_min" = annVPD_min_meanAnnAvg_CLIM, #6
     "VPD_max_95" = annVPD_max_95percentile_CLIM, 
     "annWatDef_95" = annWaterDeficit_95percentile_CLIM, 
     "annWetDegDays_5" = annWetDegDays_5percentile_CLIM, 
     "frostFreeDays_5" = durationFrostFreeDays_5percentile_CLIM, 
     "frostFreeDays" = durationFrostFreeDays_meanAnnAvg_CLIM, 
     "soilDepth" = soilDepth, #7
     "clay" = surfaceClay_perc, 
     "sand" = avgSandPerc_acrossDepth, #8
     "coarse" = avgCoarsePerc_acrossDepth, #9
     "carbon" = avgOrganicCarbonPerc_0_3cm, #10
     "AWHC" = totalAvailableWaterHoldingCapacity,
     ## anomaly variables
     tmean_anom = tmean_meanAnnAvg_3yrAnom, #15
     tmin_anom = tmin_meanAnnAvg_3yrAnom, #16
     tmax_anom = tmax_meanAnnAvg_3yrAnom, #17
    prcp_anom = prcp_meanAnnTotal_3yrAnom, #18
      t_warm_anom = T_warmestMonth_meanAnnAvg_3yrAnom,  #19
     t_cold_anom = T_coldestMonth_meanAnnAvg_3yrAnom, #20
      prcp_wet_anom = precip_wettestMonth_meanAnnAvg_3yrAnom, #21
      precp_dry_anom = precip_driestMonth_meanAnnAvg_3yrAnom,  #22
    prcp_seasonality_anom = precip_Seasonality_meanAnnAvg_3yrAnom, #23 
     prcpTempCorr_anom = PrecipTempCorr_meanAnnAvg_3yrAnom, #24
      aboveFreezingMonth_anom = aboveFreezing_month_meanAnnAvg_3yrAnom, #25  
    isothermality_anom = isothermality_meanAnnAvg_3yrAnom, #26
       annWatDef_anom = annWaterDeficit_meanAnnAvg_3yrAnom, #27
     annWetDegDays_anom = annWetDegDays_meanAnnAvg_3yrAnom,  #28
      VPD_mean_anom = annVPD_mean_meanAnnAvg_3yrAnom, #29
      VPD_min_anom = annVPD_min_meanAnnAvg_3yrAnom,  #30
      VPD_max_anom = annVPD_max_meanAnnAvg_3yrAnom,  #31
     VPD_max_95_anom = annVPD_max_95percentile_3yrAnom, #32
      annWatDef_95_anom = annWaterDeficit_95percentile_3yrAnom, #33 
      annWetDegDays_5_anom = annWetDegDays_5percentile_3yrAnom ,  #34
    frostFreeDays_5_anom = durationFrostFreeDays_5percentile_3yrAnom, #35 
      frostFreeDays_anom = durationFrostFreeDays_meanAnnAvg_3yrAnom #36
  )

# small dataset for if testing the data
if(test_run) {
  modDat_1 <- slice_sample(modDat_1, n = 1e5)
}

Add a constant to the response variable (+1) so that models run…

modDat_1[,response] <- modDat_1[,response]+1

Identify the ecoregion and response variable type to use in this model run

ecoregion <- params$ecoregion
response <- params$response
print(paste0("In this model run, the ecoregion is ", ecoregion," and the response variable is ",response))
## [1] "In this model run, the ecoregion is shrubGrass and the response variable is ShrubCover"

Subset the data to only include data for the ecoregion of interest

if (ecoregion == "shrubGrass") {
  # select data for the ecoregion of interest
  modDat_1 <- modDat_1 %>%
    filter(newRegion == "dryShrubGrass")
} else if (ecoregion == "forest") {
  # select data for the ecoregion of interest
  modDat_1 <- modDat_1 %>% 
    filter(newRegion %in% c("eastForest", "westForest"))
}

# remove the rows that have no observations for the response variable of interest
modDat_1 <- modDat_1[!is.na(modDat_1[,response]),]

Visualize the response variable

hist(modDat_1[,response], main = paste0("Histogram of ",response),
     xlab = paste0(response))

Visualize the predictor variables

The following are the candidate predictor variables for this ecoregion:

if (ecoregion == "shrubGrass") {
  # select potential predictor variables for the ecoregion of interest
        prednames <-
          c(
"tmean"             , "prcp"                    ,"prcp_seasonality"        ,"prcpTempCorr"          , 
"isothermality"     , "annWatDef"               ,"sand"                    ,"coarse"                , 
"carbon"            , "AWHC"                    ,"tmin_anom"               ,"tmax_anom"             , 
"t_warm_anom"       , "prcp_wet_anom"           ,"precp_dry_anom"          ,"prcp_seasonality_anom" , 
"prcpTempCorr_anom" , "aboveFreezingMonth_anom" ,"isothermality_anom"      ,"annWatDef_anom"        , 
"annWetDegDays_anom", "VPD_mean_anom"           ,"VPD_min_anom"            ,"frostFreeDays_5_anom"   )
  
} else if (ecoregion == "forest") {
  # select potential predictor variables for the ecoregion of interest
  prednames <- 
    c(
"tmean"                 ,"prcp"               , "prcp_dry"                , "prcpTempCorr"      ,     
"isothermality"         ,"annWatDef"          , "clay"                    , "sand"              ,     
"coarse"                ,"carbon"             , "AWHC"                    , "tmin_anom"         ,     
"tmax_anom"             ,"prcp_anom"          , "prcp_wet_anom"           , "precp_dry_anom"    ,     
"prcp_seasonality_anom" ,"prcpTempCorr_anom"  , "aboveFreezingMonth_anom" , "isothermality_anom",     
"annWatDef_anom"        ,"annWetDegDays_anom" , "VPD_mean_anom"           , "VPD_max_95_anom"   ,     
"frostFreeDays_5_anom"   )
}

# subset the data to only include these predictors, and remove any remaining NAs 
modDat_1 <- modDat_1 %>% 
  select(prednames, response, newRegion, Year, Long, Lat, NA_L1NAME, NA_L2NAME) %>% 
  drop_na()

names(prednames) <- prednames
df_pred <- modDat_1[, prednames]
# 
# # print the list of predictor variables
# knitr::kable(format = "html", data.frame("Possible_Predictors" = prednames)
# ) %>%
#   kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
create_summary <- function(df) {
  df %>% 
    pivot_longer(cols = everything(),
                 names_to = 'variable') %>% 
    group_by(variable) %>% 
    summarise(across(value, .fns = list(mean = ~mean(.x, na.rm = TRUE), min = ~min(.x, na.rm = TRUE), 
                                        median = ~median(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE)))) %>% 
    mutate(across(where(is.numeric), round, 4))
}

modDat_1[prednames] %>% 
  create_summary() %>% 
  knitr::kable(caption = 'summaries of possible predictor variables') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
summaries of possible predictor variables
variable value_mean value_min value_median value_max
AWHC 14.3063 0.1456 13.8307 35.1058
VPD_mean_anom -0.0235 -0.3600 -0.0222 0.2407
VPD_min_anom -0.0157 -0.1833 -0.0198 0.1399
aboveFreezingMonth_anom 0.0591 -1.4545 0.0435 2.2143
annWatDef 101.6656 1.5871 91.8627 451.1875
annWatDef_anom -0.1016 -3.9347 -0.0982 1.0000
annWetDegDays_anom 0.0347 -1.6606 0.0358 0.9840
carbon 1.1602 0.0001 0.8130 34.6588
coarse 8.8595 0.0000 5.9192 82.6879
frostFreeDays_5_anom -17.1524 -244.0000 -6.0000 55.8000
isothermality 37.7195 19.2892 37.5258 58.2327
isothermality_anom 0.3550 -6.0230 0.3083 10.8037
prcp 419.0550 90.8317 365.7253 1701.8013
prcpTempCorr 0.0263 -0.7742 0.0572 0.6890
prcpTempCorr_anom 0.0231 -0.6495 0.0283 0.5744
prcp_seasonality 0.9318 0.5201 0.9162 1.7536
prcp_seasonality_anom -0.0295 -0.8955 -0.0192 0.4743
prcp_wet_anom 0.0034 -1.9576 0.0229 0.6958
precp_dry_anom 0.0604 -9.0000 0.3659 1.0000
sand 46.7197 0.3922 46.4960 98.9991
t_warm_anom -0.5141 -6.3974 -0.5241 3.3598
tmax_anom -0.3335 -3.4286 -0.3680 2.3198
tmean 10.5804 -2.2935 9.4381 23.9289
tmin_anom -0.4737 -4.0550 -0.4641 2.3583
# response_summary <- modDat_1 %>% 
#     dplyr::select(#where(is.numeric), -all_of(pred_vars),
#       matches(response)) %>% 
#     create_summary()
# 
# 
# kable(response_summary, 
#       caption = 'summaries of response variables, calculated using paint') %>%
# kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 

Plot predictor vars against each other

set.seed(12011993)
# function for colors
my_fn <- function(data, mapping, method="p", use="pairwise", ...){
  
  # grab data
  x <- eval_data_col(data, mapping$x)
  y <- eval_data_col(data, mapping$y)
  
  # calculate correlation
  corr <- cor(x, y, method=method, use=use)
  
  # calculate colour based on correlation value
  # Here I have set a correlation of minus one to blue, 
  # zero to white, and one to red 
  # Change this to suit: possibly extend to add as an argument of `my_fn`
  colFn <- colorRampPalette(c("red", "white", "blue"), interpolate ='spline')
  fill <- colFn(100)[findInterval(corr, seq(-1, 1, length=100))]
  
  ggally_cor(data = data, mapping = mapping, size = 2.5, stars = FALSE, 
             digits = 2, colour = I("black"),...) + 
    theme_void() +
    theme(panel.background = element_rect(fill=fill))
  
}

if (run == TRUE) {
(corrPlot <- modDat_1 %>% 
  select(prednames) %>% 
  slice_sample(n = 5e4) %>% 
  #select(-matches("_")) %>% 
ggpairs( upper = list(continuous = my_fn, size = .1), lower = list(continuous = GGally::wrap("points", alpha = 0.1, size=0.1)), progress = FALSE))
    base::saveRDS(corrPlot, paste0("../ModelFitting/models/", response, "_",ecoregion, "_corrPlot.rds"))
  
  } else {
    # corrPlot <- readRDS(paste0("../ModelFitting/models/", response, "_",ecoregion, "_corrPlot.rds"))
    # (corrPlot)
    print(c("See previous correlation figures"))
  }

Predictor variables compared to binned response variables

set.seed(12011993)
# vector of name of response variables
vars_response <- response

# longformat dataframes for making boxplots
df_sample_plots <-  modDat_1  %>% 
  slice_sample(n = 5e4) %>% 
   rename(response = all_of(response)) %>% 
  mutate(response = case_when(
    response <= .25 ~ ".25", 
    response > .25 & response <=.5 ~ ".5", 
    response > .5 & response <=.75 ~ ".75", 
    response >= .75  ~ "1", 
  )) %>% 
  select(c(response, prednames)) %>% 
  tidyr::pivot_longer(cols = unname(prednames), 
               names_to = "predictor", 
               values_to = "value"
               )  
 

  ggplot(df_sample_plots, aes_string(x= "response", y = 'value')) +
  geom_boxplot() +
  facet_wrap(~predictor , scales = 'free_y') + 
  ylab("Predictor Variable Values") + 
    xlab(response)

Standardize the predictor variables for the model-fitting process

modDat_1_s <- modDat_1 %>% 
  mutate(across(all_of(prednames), base::scale, .names = "{.col}_s")) 
names(modDat_1_s) <- c(names(modDat_1),
                       paste0(prednames, "_s")
                       )
  
scaleFigDat_1 <- modDat_1_s %>% 
  select(c(Long, Lat, Year, prednames)) %>% 
  pivot_longer(cols = all_of(names(prednames)), 
               names_to = "predNames", 
               values_to = "predValues_unScaled")
scaleFigDat_2 <- modDat_1_s %>% 
  select(c(Long, Lat, Year,paste0(prednames, "_s"))) %>% 
  pivot_longer(cols = all_of(paste0(prednames, "_s")), 
               names_to = "predNames", 
               values_to = "predValues_scaled", 
               names_sep = ) %>% 
  mutate(predNames = str_replace(predNames, pattern = "_s$", replacement = ""))

scaleFigDat_3 <- scaleFigDat_1 %>% 
  left_join(scaleFigDat_2)

ggplot(scaleFigDat_3) + 
  facet_wrap(~predNames, scales = "free") +
  geom_histogram(aes(predValues_unScaled), fill = "lightgrey", col = "darkgrey") + 
  geom_histogram(aes(predValues_scaled), fill = "lightblue", col = "blue") +
  xlab ("predictor variable values") + 
  ggtitle("Comparing the distribution of unscaled (grey) to scaled (blue) predictor variables")

Model Fitting

Visualize the level 2 ecoregions and how they differ across environmental space

## visualize the variation between groups across environmental space
## make data into spatial format
modDat_1_sf <- modDat_1 %>% 
  st_as_sf(coords = c("Long", "Lat"), crs = st_crs("PROJCRS[\"unnamed\",\n    BASEGEOGCRS[\"unknown\",\n        DATUM[\"unknown\",\n            ELLIPSOID[\"Spheroid\",6378137,298.257223563,\n                LENGTHUNIT[\"metre\",1,\n                    ID[\"EPSG\",9001]]]],\n        PRIMEM[\"Greenwich\",0,\n            ANGLEUNIT[\"degree\",0.0174532925199433,\n                ID[\"EPSG\",9122]]]],\n    CONVERSION[\"Lambert Conic Conformal (2SP)\",\n        METHOD[\"Lambert Conic Conformal (2SP)\",\n            ID[\"EPSG\",9802]],\n        PARAMETER[\"Latitude of false origin\",42.5,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8821]],\n        PARAMETER[\"Longitude of false origin\",-100,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8822]],\n        PARAMETER[\"Latitude of 1st standard parallel\",25,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8823]],\n        PARAMETER[\"Latitude of 2nd standard parallel\",60,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8824]],\n        PARAMETER[\"Easting at false origin\",0,\n            LENGTHUNIT[\"metre\",1],\n            ID[\"EPSG\",8826]],\n        PARAMETER[\"Northing at false origin\",0,\n            LENGTHUNIT[\"metre\",1],\n            ID[\"EPSG\",8827]]],\n    CS[Cartesian,2],\n        AXIS[\"easting\",east,\n            ORDER[1],\n            LENGTHUNIT[\"metre\",1,\n                ID[\"EPSG\",9001]]],\n        AXIS[\"northing\",north,\n            ORDER[2],\n            LENGTHUNIT[\"metre\",1,\n                ID[\"EPSG\",9001]]]]"))


## do a pca of climate across level 2 ecoregions
pca <- prcomp(modDat_1_s[,paste0(prednames, "_s")])
library(factoextra)
(fviz_pca_ind(pca, habillage = modDat_1_s$NA_L2NAME, label = "none", addEllipses = TRUE, ellipse.level = .95, ggtheme = theme_minimal(), alpha.ind = .1))

if(ecoregion == "shrubGrass") {
  print("We'll combine the 'Mediterranean California' and 'Western Sierra Madre Piedmont' ecoregions (into 'Mediterranean Piedmont'). We'll also combine `Tamaulipas-Texas semiarid plain' and 'South Central semiarid prairies' ecoregions (into (`Semiarid plain and prairies`)" )
  
  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("MEDITERRANEAN CALIFORNIA", "WESTERN SIERRA MADRE PIEDMONT"), "NA_L2NAME"] <- "MEDITERRANEAN PIEDMONT"
  modDat_1[modDat_1$NA_L2NAME %in% c("MEDITERRANEAN CALIFORNIA", "WESTERN SIERRA MADRE PIEDMONT"), "NA_L2NAME"] <- "MEDITERRANEAN PIEDMONT"
  
  modDat_1_s[modDat_1_s$NA_L2NAME %in% c("TAMAULIPAS-TEXAS SEMIARID PLAIN", "SOUTH CENTRAL SEMIARID PRAIRIES"), "NA_L2NAME"] <- "SEMIARID PLAIN AND PRAIRIES"
  modDat_1[modDat_1$NA_L2NAME %in% c("TAMAULIPAS-TEXAS SEMIARID PLAIN", "SOUTH CENTRAL SEMIARID PRAIRIES"), "NA_L2NAME"] <- "SEMIARID PLAIN AND PRAIRIES"
}
## [1] "We'll combine the 'Mediterranean California' and 'Western Sierra Madre Piedmont' ecoregions (into 'Mediterranean Piedmont'). We'll also combine `Tamaulipas-Texas semiarid plain' and 'South Central semiarid prairies' ecoregions (into (`Semiarid plain and prairies`)"
# make a table of n for each region
modDat_1 %>% 
  group_by(NA_L2NAME) %>% 
  dplyr::summarize("Number_Of_Observations" = length(NA_L2NAME)) %>% 
  rename("Level_2_Ecoregion" = NA_L2NAME)%>% 
  kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
Level_2_Ecoregion Number_Of_Observations
COLD DESERTS 82588
MEDITERRANEAN PIEDMONT 3453
SEMIARID PLAIN AND PRAIRIES 26007
TEMPERATE PRAIRIES 3226
TEXAS-LOUISIANA COASTAL PLAIN 2194
WARM DESERTS 12022
WEST-CENTRAL SEMIARID PRAIRIES 19421

Then, look at the spatial distribution and environmental characteristics of the grouped ecoregions

# download map info for visualization
us_states <- suppressMessages(tigris::states())


cropped_states <- suppressMessages(us_states %>%
  dplyr::filter(NAME!="Hawaii") %>%
  dplyr::filter(NAME!="Alaska") %>%
  dplyr::filter(NAME!="Puerto Rico") %>%
  dplyr::filter(NAME!="American Samoa") %>%
  dplyr::filter(NAME!="Guam") %>%
  dplyr::filter(NAME!="Commonwealth of the Northern Mariana Islands") %>%
  dplyr::filter(NAME!="United States Virgin Islands") %>%

  sf::st_sf() %>%
  sf::st_transform(sf::st_crs(modDat_1_sf))) #%>%
  #sf::st_crop(sf::st_bbox(modDat_1_sf)+c(-1,-1,1,1))

map1 <- ggplot() +
  geom_sf(data=cropped_states,fill='white') +
  geom_sf(data=modDat_1_sf,aes(fill=as.factor(NA_L2NAME)),linewidth=0.5,alpha=0.5) +
  geom_point(data=modDat_1,alpha=0.5, 
             aes(x = Long, y = Lat, color=as.factor(NA_L2NAME))) +
  #scale_fill_okabeito() +
  #scale_color_okabeito() +
 # theme_default() +
  theme(legend.position = 'none') +
  labs(title = "Level 2 Ecoregions as spatial blocks")

hull <- modDat_1_sf %>%
  ungroup() %>%
  group_by(NA_L2NAME) %>%
  slice(chull(tmean, prcp))

plot1<-ggplot(data=modDat_1_sf,aes(x=tmean,y=prcp)) +
  geom_polygon(data = hull, alpha = 0.25,aes(fill=NA_L2NAME) )+
  geom_point(aes(group=NA_L2NAME,color=NA_L2NAME),alpha=0.25) +
  theme_minimal() + xlab("Annual Average T_mean - long-term average") +
  ylab("Annual Average Precip - long-term average") #+
  #scale_color_okabeito() +
  #scale_fill_okabeito()

plot2<-ggplot(data=modDat_1_sf %>%
                pivot_longer(cols=tmean:prcp),
              aes(x=value,group=name)) +
  # geom_polygon(data = hull, alpha = 0.25,aes(fill=fold) )+
  geom_density(aes(group=NA_L2NAME,fill=NA_L2NAME),alpha=0.25) +
  theme_minimal() +
  facet_wrap(~name,scales='free')# +
  #scale_color_okabeito() +
  #scale_fill_okabeito()
 
library(patchwork)
(combo <- (map1+plot1)/plot2) 

## Currently, removing data from the “Texas Coastal Plain”, since it’s quite different from other regions and is really messing with model fit

modDat_1 <- modDat_1 %>% 
  filter(NA_L2NAME != "TEXAS-LOUISIANA COASTAL PLAIN")

modDat_1_s <- modDat_1_s %>% 
  filter(NA_L2NAME != "TEXAS-LOUISIANA COASTAL PLAIN")

Fit a global model with all of the data

First, fit a LASSO regression model using the glmnet R package

  • This regression is a Gamma glm with a log link
  • Use cross validation across level 2 ecoregions to tune the lambda parameter in the LASSO model
  • this model is fit to using the scaled weather/climate/soils variables
  • this list of possible predictors includes:
    1. main effects
    2. interactions between all soils variables
    3. interactions between climate and weather variables
    4. transformed main effects (squared, log-transformed (add a uniform integer – 20– to all variables prior to log-transformation), square root-transformed (add a uniform integer – 20– to all variables prior to log-transformation))
## 
## Call:  cv.glmnet(x = X[, 2:ncol(X)], y = y, type.measure = "mse", foldid = my_folds,      keep = TRUE, parallel = TRUE, family = stats::Gamma(link = "log"),      alpha = 1, nlambda = 100, standardize = FALSE) 
## 
## Measure: Mean-Squared Error 
## 
##      Lambda Index Measure    SE Nonzero
## min 0.00357    48   135.9 22.03     119
## 1se 0.04830    20   157.4 24.42      12

Then, fit regular glm models (Gamma glm with a log link), first using the coefficients from the ‘best’ lambda identified in the LASSO model, as then using the coefficients from the ‘1SE’ lambda identified from the LASSO (this is the value of lambda such that the cross validation error is within 1 standard error of the minimum).

## fit w/ the identified coefficients from the 'best' lambda, but using the glm function
  mat_glmnet_best <- as.matrix(bestLambda_coef)
  mat2_glmnet_best <- mat_glmnet_best[mat_glmnet_best[,1] != 0,]
  names(mat2_glmnet_best) <- rownames(mat_glmnet_best)[mat_glmnet_best[,1] != 0]

  if(length(mat2_glmnet_best) == 1) {
    f_glm_bestLambda <- as.formula(paste0(response, "~ 1"))
  } else {
  f_glm_bestLambda <- as.formula(paste0(response, " ~ ", paste0(names( mat2_glmnet_best)[2:length(names( mat2_glmnet_best))], collapse = " + ")))
  }
  

  fit_glm_bestLambda <- glm(data = modDat_1_s
                              , formula = 
  f_glm_bestLambda,
               ,family =  stats::Gamma(link = "log")
    )
  
   ## fit w/ the identified coefficients from the '1se' lambda, but using the glm function
  mat_glmnet_1se <- as.matrix(seLambda_coef)
  mat2_glmnet_1se <- mat_glmnet_1se[mat_glmnet_1se[,1] != 0,]
  names(mat2_glmnet_1se) <- rownames(mat_glmnet_1se)[mat_glmnet_1se[,1] != 0]
  if(length(mat2_glmnet_1se) == 1) {
    f_glm_1se <- as.formula(paste0(response, "~ 1"))
  } else {
  f_glm_1se <- as.formula(paste0(response, " ~ ", paste0(names( mat2_glmnet_1se)[2:length(names( mat2_glmnet_1se))], collapse = " + ")))
  }


  fit_glm_se <- glm(data = modDat_1_s, formula = 
  f_glm_1se,
               ,family =  stats::Gamma(link = "log")
    )

Then, we predict (on the training set) using both of these models (best lambda and 1se lambda)

  ## predict on the test data
  # lasso model predictions with the optimal lambda
  optimal_pred <- predict(fit_glm_bestLambda, newx=X[,2:ncol(X)], type = "response")
  optimal_pred_1se <-  predict(fit_glm_se, newx=X[,2:ncol(X)], type = "response")
    null_fit <- glm(#data = data.frame("y" = y, X[,paste0(prednames, "_s")]), 
      formula = y ~ 1, family = stats::Gamma(link = "log"))
  null_pred <- predict(null_fit, newdata = as.data.frame(X), type = "response"
                       )

  # save data
  fullModOut <- list(
    "modelObject" = fit,
    "nullModelObject" = null_fit,
    "modelPredictions" = data.frame(#ecoRegion_holdout = rep(test_eco,length(y)),
      obs=y,
                    pred_opt=optimal_pred, 
                    pred_opt_se = optimal_pred_1se,
                    pred_null=null_pred#,
                    #pred_nopenalty=nopen_pred
                    ))
  
  
# calculate correlations between null and optimal model 
my_cors <- c(cor(optimal_pred, y),
             cor(optimal_pred_1se, y), 
            cor(null_pred, y)
            )

# calculate mse between null and optimal model 
my_mse <- c(mean((fullModOut$modelPredictions$pred_opt -  y)^2) ,
            mean((fullModOut$modelPredictions$pred_opt_se -  y)^2) ,
            mean((fullModOut$modelPredictions$pred_null - y)^2)#,
            #mean((obs_pred$pred_nopenalty - obs_pred$obs)^2)
            )

ggplot() + 
  geom_point(aes(X[,2], fullModOut$modelPredictions$obs), col = "black", alpha = .1) + 
  geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt), col = "red", alpha = .1) + ## predictions w/ the CV model
  geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt_se), col = "green", alpha = .1) + ## predictions w/ the CV model (1se lambda)
  geom_point(aes(X[,2], fullModOut$modelPredictions$pred_null), col = "blue", alpha = .1) + 
  labs(title = "A rough comparison of observed and model-predicted values", 
       subtitle = "black = observed values \n red = predictions from 'best lambda' model \n green = predictions from '1se' lambda model \n blue = predictions from null model") +
  xlab(colnames(X)[2])

  #ylim(c(0,200))

The internal cross-validation process to fit the global LASSO model identified an optimal lambda value (regularization parameter) of r{print(best_lambda)}. The lambda value such that the cross validation error is within 1 standard error of the minimum (“1se lambda”) was `r{print(fit$lambda.1se)}`` . The following coefficients were kept in each model:

# the coefficient matrix from the 'best model' -- find and print those coefficients that aren't 0 in a table
coef_glm_bestLambda <- coef(fit_glm_bestLambda) %>% 
  data.frame() 
coef_glm_bestLambda$coefficientName <- rownames(coef_glm_bestLambda)
names(coef_glm_bestLambda)[1] <- "coefficientValue_bestLambda"
# coefficient matrix from the '1se' model 
coef_glm_1se <- coef(fit_glm_se) %>% 
  data.frame() 
coef_glm_1se$coefficientName <- rownames(coef_glm_1se)
names(coef_glm_1se)[1] <- "coefficientValue_1seLambda"
# add together
coefs <- full_join(coef_glm_bestLambda, coef_glm_1se) %>% 
  select(coefficientName, coefficientValue_bestLambda, coefficientValue_1seLambda)

globModTerms <- coefs[!is.na(coefs$coefficientValue_bestLambda), "coefficientName"]

kable(coefs, col.names = c("Coefficient Name", "Value from best lambda model", "Value from 1se lambda model")
      ) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) 
Coefficient Name Value from best lambda model Value from 1se lambda model
(Intercept) 2.5831242 2.6637533
tmean_s -0.1243158 NA
prcp_s -0.0409693 NA
prcp_seasonality_s -0.0605209 NA
prcpTempCorr_s -0.1441793 -0.2699495
annWatDef_s 0.1694392 NA
sand_s 0.0308906 0.0330081
coarse_s 0.0720718 0.0512703
tmax_anom_s -0.0490631 -0.0621642
t_warm_anom_s 0.0322684 NA
prcp_seasonality_anom_s 0.0013698 NA
prcpTempCorr_anom_s 0.0236428 NA
aboveFreezingMonth_anom_s -0.0254688 NA
isothermality_anom_s -0.0084954 NA
annWatDef_anom_s -0.0541469 NA
annWetDegDays_anom_s -0.0315726 NA
VPD_mean_anom_s -0.0123389 NA
I(tmean_s^2) 0.0143084 0.0726989
I(prcp_s^2) 0.0703173 NA
I(prcp_seasonality_s^2) 0.0322623 NA
I(prcpTempCorr_s^2) -0.0408715 -0.0567457
I(isothermality_s^2) -0.0001317 NA
I(annWatDef_s^2) -0.2719945 NA
I(t_warm_anom_s^2) -0.0081773 NA
I(prcp_wet_anom_s^2) 0.0000329 NA
I(precp_dry_anom_s^2) -0.0044645 NA
I(prcpTempCorr_anom_s^2) 0.0018465 NA
I(aboveFreezingMonth_anom_s^2) -0.0065246 NA
I(annWatDef_anom_s^2) 0.0009591 NA
I(annWetDegDays_anom_s^2) -0.0086279 NA
I(VPD_min_anom_s^2) -0.0138688 NA
I(sand_s^2) -0.0288796 -0.0323393
I(coarse_s^2) -0.0211539 NA
I(carbon_s^2) -0.0033158 NA
I(AWHC_s^2) -0.0179819 -0.0133662
aboveFreezingMonth_anom_s:isothermality_anom_s 0.0093403 NA
prcpTempCorr_s:aboveFreezingMonth_anom_s -0.0183931 NA
tmean_s:aboveFreezingMonth_anom_s 0.0051063 NA
aboveFreezingMonth_anom_s:VPD_min_anom_s 0.0037625 NA
isothermality_anom_s:annWatDef_anom_s 0.0042007 NA
annWatDef_anom_s:isothermality_s -0.0447078 NA
annWatDef_anom_s:precp_dry_anom_s -0.0219362 NA
t_warm_anom_s:annWatDef_anom_s -0.0093485 NA
annWatDef_anom_s:VPD_mean_anom_s -0.0167061 NA
annWatDef_s:annWetDegDays_anom_s 0.0186062 NA
annWatDef_s:isothermality_s -0.1284059 NA
prcpTempCorr_s:annWatDef_s -0.0367759 0.1492655
tmean_s:annWatDef_s 0.3765939 NA
isothermality_anom_s:annWetDegDays_anom_s -0.0067057 NA
annWetDegDays_anom_s:isothermality_s -0.0101418 NA
prcp_seasonality_anom_s:annWetDegDays_anom_s 0.0052195 NA
annWetDegDays_anom_s:prcp_wet_anom_s 0.0040362 NA
annWetDegDays_anom_s:precp_dry_anom_s 0.0065257 NA
annWetDegDays_anom_s:VPD_min_anom_s -0.0090637 NA
isothermality_anom_s:frostFreeDays_5_anom_s 0.0091919 NA
isothermality_s:frostFreeDays_5_anom_s 0.0312872 NA
prcp_s:frostFreeDays_5_anom_s -0.0042151 NA
prcp_seasonality_s:frostFreeDays_5_anom_s -0.0110195 NA
prcp_wet_anom_s:frostFreeDays_5_anom_s -0.0079416 NA
prcpTempCorr_anom_s:frostFreeDays_5_anom_s 0.0044353 NA
precp_dry_anom_s:frostFreeDays_5_anom_s -0.0057470 NA
tmean_s:frostFreeDays_5_anom_s -0.0124822 NA
VPD_mean_anom_s:frostFreeDays_5_anom_s 0.0100823 NA
isothermality_anom_s:isothermality_s -0.0197807 NA
prcp_s:isothermality_anom_s 0.0155560 NA
isothermality_anom_s:prcp_wet_anom_s -0.0101967 NA
prcpTempCorr_anom_s:isothermality_anom_s -0.0035152 NA
prcpTempCorr_s:isothermality_anom_s 0.0131749 NA
isothermality_anom_s:precp_dry_anom_s -0.0021722 NA
tmean_s:isothermality_anom_s 0.0150164 NA
isothermality_anom_s:VPD_mean_anom_s 0.0066636 NA
prcp_seasonality_anom_s:isothermality_s 0.0270903 NA
prcp_seasonality_s:isothermality_s 0.0088186 NA
isothermality_s:prcp_wet_anom_s -0.0185408 NA
t_warm_anom_s:isothermality_s 0.0313736 NA
tmax_anom_s:isothermality_s -0.0513958 NA
tmean_s:isothermality_s 0.0687503 NA
VPD_mean_anom_s:isothermality_s -0.0051472 NA
prcp_s:prcp_seasonality_s -0.0571638 NA
prcp_s:prcpTempCorr_anom_s -0.0075738 NA
prcp_s:prcpTempCorr_s 0.0563162 NA
prcp_s:VPD_mean_anom_s 0.0316055 NA
prcp_seasonality_s:prcp_seasonality_anom_s 0.0122820 NA
t_warm_anom_s:prcp_seasonality_anom_s -0.0137300 NA
prcp_seasonality_anom_s:VPD_mean_anom_s -0.0013042 NA
prcp_seasonality_anom_s:VPD_min_anom_s 0.0394177 NA
prcp_seasonality_s:prcp_wet_anom_s -0.0027598 NA
prcp_seasonality_s:prcpTempCorr_anom_s 0.0325765 NA
prcp_seasonality_s:prcpTempCorr_s -0.0745064 NA
prcp_seasonality_s:t_warm_anom_s 0.0099810 NA
prcp_seasonality_s:tmax_anom_s 0.0352526 NA
tmean_s:prcp_seasonality_s 0.0671467 0.0572175
prcp_seasonality_s:tmin_anom_s -0.0160613 NA
prcpTempCorr_anom_s:prcp_wet_anom_s 0.0024090 NA
prcpTempCorr_s:prcp_wet_anom_s -0.0068373 NA
precp_dry_anom_s:prcp_wet_anom_s 0.0002547 NA
VPD_mean_anom_s:prcp_wet_anom_s 0.0168412 NA
VPD_min_anom_s:prcp_wet_anom_s -0.0357394 NA
prcpTempCorr_s:prcpTempCorr_anom_s 0.0310526 NA
t_warm_anom_s:prcpTempCorr_anom_s -0.0218969 NA
tmax_anom_s:prcpTempCorr_anom_s -0.0049940 NA
tmean_s:prcpTempCorr_anom_s -0.0262282 NA
prcpTempCorr_anom_s:VPD_mean_anom_s -0.0029778 NA
prcpTempCorr_anom_s:VPD_min_anom_s 0.0218879 NA
prcpTempCorr_s:t_warm_anom_s 0.0035769 NA
tmean_s:prcpTempCorr_s 0.2115492 NA
prcpTempCorr_s:tmin_anom_s -0.0292778 NA
tmean_s:precp_dry_anom_s -0.0134025 NA
tmax_anom_s:t_warm_anom_s -0.0148192 NA
t_warm_anom_s:tmin_anom_s 0.0163254 NA
t_warm_anom_s:VPD_mean_anom_s -0.0227248 NA
tmean_s:tmax_anom_s 0.0088978 NA
tmax_anom_s:tmin_anom_s 0.0089709 NA
tmax_anom_s:VPD_mean_anom_s 0.0203610 NA
tmax_anom_s:VPD_min_anom_s -0.0162220 NA
carbon_s:AWHC_s 0.0011345 NA
sand_s:AWHC_s -0.0024093 NA
coarse_s:carbon_s -0.0404514 NA
sand_s:carbon_s -0.0735050 NA
sand_s:coarse_s 0.0301540 NA
isothermality_s NA 0.0051709
isothermality_s:isothermality_anom_s NA -0.0596827

Visualizations of Model Predictions and Residuals – using best lambda model

observed vs. predicted values

Predicting on the data

  # create prediction for each each model
# (i.e. for each fire proporation variable)
predict_by_response <- function(mod, df) {
  df_out <- df
  response_name <- paste0(response, "_pred")
  df_out <- df_out %>% cbind(predict(mod, newx= df_out, #s="lambda.min", 
                                     type = "response"))
   colnames(df_out)[ncol(df_out)] <- response_name
  return(df_out)
}

pred_glm1 <- predict_by_response(fit_glm_bestLambda, X[,2:ncol(X)])

# add back in true y values
pred_glm1 <- pred_glm1 %>% 
  cbind( data.frame("y" = y))
# rename the true response column to not be 'y_test' 
colnames(pred_glm1)[which(colnames(pred_glm1) == "y")] <- paste(response)

# add back in lat/long data 
pred_glm1 <- pred_glm1 %>% 
  cbind(modDat_1_s[,c("Long", "Lat", "Year")])

pred_glm1$resid <- pred_glm1[,response] - pred_glm1[,paste0(response, "_pred")]
pred_glm1$extremeResid <- NA
pred_glm1[pred_glm1$resid > 70 | pred_glm1$resid < -70,"extremeResid"] <- 1

# plot(x = pred_glm1[,response],
#      y = pred_glm1[,paste0(response, "_pred")],
#      xlab = "observed values", ylab = "predicted values")
# points(x = pred_glm1[!is.na(pred_glm1$extremeResid), response],
#        y = pred_glm1[!is.na(pred_glm1$extremeResid), paste0(response, "_pred")],
#        col = "red")

Maps of Observations, Predictions, and Residuals=

Observations across the temporal range of the dataset

pred_glm1 <- pred_glm1 %>% 
  mutate(resid = .[[response]] - .[[paste0(response,"_pred")]]) 

# rasterize
# get reference raster
test_rast <-  rast("../../../Data_raw/dayMet/rawMonthlyData/orders/70e0da02b9d2d6e8faa8c97d211f3546/Daymet_Monthly_V4R1/data/daymet_v4_prcp_monttl_na_1980.tif") %>% 
  terra::aggregate(fact = 8, fun = "mean")
## |---------|---------|---------|---------|=========================================                                          
# rasterize data
plotObs <- pred_glm1 %>% 
         drop_na(paste(response)) %>% 
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) %>% 
  terra::rasterize(y = test_rast, 
                   field = response, 
                   fun = mean) #%>% 
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>% 
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotObs, na.rm = TRUE)

plotObs_2 <- plotObs %>% 
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2])) 
       )
# make figures
ggplot() +
geom_spatraster(data = plotObs_2) + 
  geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) +
labs(title = paste0("Observations of ", response, " in the ",ecoregion, " ecoregion"),
     subtitle = "bestLambda model") +
  scale_fill_gradient2(low = "brown",
                       mid = "wheat" ,
                       high = "darkgreen" , 
                       midpoint = 0,   na.value = "lightgrey")

Predictions across the temporal range of the dataset

# rasterize data
plotPred <- pred_glm1 %>% 
         drop_na(paste0(response,"_pred")) %>% 
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) %>% 
  terra::rasterize(y = test_rast, 
                   field = paste0(response,"_pred"), 
                   fun = mean) #%>% 
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>% 
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

# get the point location of those predictions that are > 100
highPred_points <- pred_glm1 %>% 
  filter(.[[paste0(response,"_pred")]] > 100 | 
           .[[paste0(response, "_pred")]] < 0) %>% 
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) 

# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotPred, na.rm = TRUE)

plotPred_2 <- plotPred %>% 
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2])) 
       )
# make figures
ggplot() +
geom_spatraster(data = plotPred_2) + 
  geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA )  + geom_sf(data = highPred_points, col = "red") +
labs(title = paste0("Predictions from the fitted model of ", response, " in the ",ecoregion, " ecoregion"),
     subtitle =  "bestLambda model")  +
  scale_fill_gradient2(low = "wheat",
                       mid = "darkgreen",
                       high = "red" , 
                       midpoint = 100,   na.value = "lightgrey",
                       limits = c(0,100))

Residuals across the entire temporal extent of the dataset

# rasterize data
plotResid_rast <- pred_glm1 %>% 
         drop_na(resid) %>% 
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) %>% 
  terra::rasterize(y = test_rast, 
                   field = "resid", 
                   fun = mean) #%>% 
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>% 
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

# get the extent of this particular raster, and crop it accordingly
tempExt <- crds(plotResid_rast, na.rm = TRUE)

plotResid_rast_2 <- plotResid_rast %>% 
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2])) 
       )

# identify locations where residuals are >100 or < -100
badResids_high <- pred_glm1 %>% 
  filter(resid > 100)  %>% 
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) 
badResids_low <- pred_glm1 %>% 
  filter(resid < -100)  %>% 
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) 
# make figures
map <- ggplot() +
geom_spatraster(data = 
plotResid_rast_2) + 
  geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA )  + 
  geom_sf(data = badResids_high, col = "blue") +
  geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Resids. (obs. - pred.) from Grass/shrub ecoregion-wide model of ", response),
     subtitle = "bestLambda model \n red points indication locations that have residuals below -100 \n blue points indicate locatiosn that have residuals above 100") +
  scale_fill_gradient2(low = "red",
                       mid = "white" ,
                       high = "blue" , 
                       midpoint = 0,   na.value = "lightgrey",
                       limits = c(-100,100)
                       )
hist <- ggplot(pred_glm1) + 
  geom_histogram(aes(resid), fill = "lightgrey", col = "darkgrey") + 
  geom_text(aes(x = min(resid)*.9, y = 1500, label = paste0("min = ", round(min(resid),2)))) +
  geom_text(aes(x = max(resid)*.9, y = 1500, label = paste0("max = ", round(max(resid),2))))

library(ggpubr)
ggarrange(map, hist, heights = c(3,1), ncol = 1)

Quantile plots

Binning predictor variables into “Deciles” (actually percentiles) and looking at the mean predicted probability for each percentile. The use of the word Decentiles is just a legacy thing (they started out being actual Percentiles)

var_prop_pred <- paste0(response, "_pred")
response_vars <- c(response, var_prop_pred)

prednames_fig <- paste(str_split(globModTerms, ":", simplify = TRUE)) 
prednames_fig <- str_replace(prednames_fig, "I\\(", "")
prednames_fig <- str_replace(prednames_fig, "\\^2\\)", "")
prednames_fig <- unique(prednames_fig[prednames_fig>0])
if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1_deciles <- predvars2deciles(pred_glm1,
                                      response_vars = response_vars,
                                        pred_vars = prednames_fig)
}

Here is a quick version of LOESS curves fit to raw data (to double-check the quantile plot calculations)

if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
  pred_glm1 %>%
  select(all_of(c(prednames_fig, response_vars))) %>%
  pivot_longer(cols = prednames_fig)  %>%
  ggplot() +
  facet_wrap(~name, scales = "free") +
  geom_point(aes(x = value, y =  .data[[paste(response)]]), col = "darkblue", alpha = .1)  + # observed values
  geom_point(aes(x = value, y = .data[[response_vars[2]]]), col = "lightblue", alpha = .1) + # model-predicted values
  geom_smooth(aes(x = value, y =  .data[[paste(response)]]), col = "black", se = FALSE) +
  geom_smooth(aes(x = value, y = .data[[response_vars[2]]]), col = "brown", se = FALSE)

}

Below are the actual quantile plots (note that the predictor variables are scaled)

if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {

# publication quality version
g3 <- decile_dotplot_pq(pred_glm1_deciles, response = response) + ggtitle("Decile Plot")

g4 <- add_dotplot_inset(g3, pred_glm1_deciles)


  
if(save_figs) {
  png(paste0("figures/quantile_plots/quantile_plot_", response,  "_",ecoregion,".png"), 
     units = "in", res = 600, width = 5.5, height = 3.5 )
    print(g4)
  dev.off()
}

g4
}

Deciles Filtered

20th and 80th percentiles for each climate variable

df <- pred_glm1[, prednames_fig] #%>% 
  #mutate(MAT = MAT - 273.15) # k to c
quantiles <- map(df, quantile, probs = c(0.2, 0.8), na.rm = TRUE)

Filtered ‘Decile’ plots of data. These plots show each vegetation variable, but only based on data that falls into the upper and lower two deciles of each predictor variable.

if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_filt <- predvars2deciles( pred_glm1, 
                         response_vars = response_vars,
                         pred_vars = prednames_fig,
                         filter_var = TRUE,
                         filter_vars = prednames_fig) 

decile_dotplot_filtered_pq(pred_glm1_deciles_filt, xvars = prednames_fig)
#decile_dotplot_filtered_pq(pred_glm1_deciles_filt)

}

Filtered quantile figure with middle 2 deciles also shown

if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_filt_mid <- predvars2deciles(pred_glm1, 
                         response_vars = response_vars,
                         pred_vars = prednames_fig,
                         filter_vars = prednames_fig,
                         filter_var = TRUE,
                         add_mid = TRUE)

g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt_mid, xvars = prednames_fig)
g

if(save_figs) {x
jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", , ".jpeg"),
     units = "in", res = 600, width = 5.5, height = 6 )
  g 
dev.off()
}
}

Cross-validation

Use terms from global model to re-fit and predict on different held out regions

Figures show residuals for each of the models fit to held-out ecoregions

These models were fit to six ecoregions, and then predict on the indicated heldout ecoregion

if (length(prednames_fig) == 0) {
  print("The model only contains one predictor (an intercept), so cross validation isn't practical")
} else {



## code from Tredennick et al. 2020
# try each separate level II ecoregion as a test set
# make a list to hold output data
outList <- vector(mode = "list", length = length(sort(unique(modDat_1$NA_L2NAME))))
# obs_pred <- data.frame(ecoregion = character(),obs = numeric(),
#                        pred_opt = numeric(), pred_null = numeric()#,
#                        #pred_nopenalty = numeric()
#                        )

## get the model specification from the global model
mat <- as.matrix(coef(fit_glm_bestLambda, s = "lambda.min"))
mat2 <- mat[mat[,1] != 0,]

f_cv <- as.formula(paste0(response, " ~ ", paste0(names(mat2)[2:length(names(mat2))], collapse = " + ")))

X_cv <- model.matrix(object = f_cv, data = modDat_1_s)
# get response variable
y_cv <- as.matrix(modDat_1_s[,response])

  
# now, loop through so with each iteration, a different ecoregion is held out
 for(i_eco in sort(unique(modDat_1_s$NA_L2NAME))){

  # split into training and test sets
  test_eco <- i_eco
  print(test_eco)
  # identify the rowID of observations to be in the training and test datasets
  train <- which(modDat_1_s$NA_L2NAME!=test_eco) # data for all ecoregions that aren't 'i_eco'
  test <- which(modDat_1_s$NA_L2NAME==test_eco) # data for the ecoregion that is 'i_eco'

  trainDat_all <- modDat_1_s %>% 
    slice(train) %>% 
    select(-newRegion)
  testDat_all <- modDat_1_s %>% 
    slice(test) %>% 
    select(-newRegion)

  # get the model matrices for input and response variables for cross validation model specification
  X_train <- as.matrix(X_cv[train,])
  X_test <- as.matrix(X_cv[test,])

  y_train <- modDat_1_s[train,response]
  y_test <- modDat_1_s[test,response]
  
  # get the model matrices for input and response variables for original model specification
  X_train_glob <- as.matrix(X[train,])
  X_test_glob <- as.matrix(X[test,])

  y_train_glob <- modDat_1_s[train,response]
  y_test_glob <- modDat_1_s[test,response]

  train_eco <- modDat_1_s$NA_L2NAME[train]

  # Fit model using glm-----------------------------------------------
#
#   # specify leave-one-year-out cross-validation
#   my_folds <- as.numeric(as.factor(train_eco))
#
  # fit_i <- glmnet(
  #   x = X_train[,2:ncol(X_train)],
  #   y = y_train,
  #   #family = gaussian,
  #   family = stats::Gamma(link = "log"),
  #   alpha = 1,  # 0 == ridge regression, 1 == lasso, 0.5 ~~ elastic net
  #   #lambda = lambdas,
  #   #type.measure="mse",
  #   #penalty.factor = pen_facts,
  #   foldid = my_folds,
  #   standardize = TRUE
  # )

  ## just try a regular glm w/ the components from the global model
  fit_i <- glm(data = trainDat_all, formula = f_cv, 
    # data = trainDat_all, 
    # formula = TotalTreeCover ~ prcp_s + prcp_seasonality_s + prcpTempCorr_s + 
    # carbon_s + AWHC_s + t_warm_anom_s + prcp_wet_anom_s + annWetDegDays_anom_s +
    # VPD_min_anom_s + I(prcp_seasonality_s^2) + I(prcpTempCorr_s^2) +
    # I(isothermality_s^2) + I(annWatDef_s^2) + I(tmin_anom_s^2) +
    # I(precp_dry_anom_s^2) + I(prcpTempCorr_anom_s^2) + I(annWatDef_anom_s^2) +
    # I(VPD_min_anom_s^2) + I(sand_s^2) + I(AWHC_s^2) + annWatDef_s:annWetDegDays_anom_s +
    # prcpTempCorr_s:annWatDef_s + prcp_seasonality_anom_s:annWetDegDays_anom_s +
    # prcpTempCorr_anom_s:annWetDegDays_anom_s + prcpTempCorr_s:annWetDegDays_anom_s +
    # tmean_s:frostFreeDays_5_anom_s + prcp_s:isothermality_s +
    # t_warm_anom_s:prcp_seasonality_anom_s + tmin_anom_s:t_warm_anom_s +
    # sand_s:AWHC_s
    ,
               family =  stats::Gamma(link = "log")
    )

    coef(fit_i)
    
  # lasso model predictions with the optimal lambda
  optimal_pred <- predict(fit_i, newdata= testDat_all, type = "response"
                          )
  # null model and predictions
  # the "null" model in this case is the global model
  # predict on the test data for this iteration w/ the global model 
  null_pred <- predict.glm(fit_glm_bestLambda, newdata = testDat_all, type = "response")

  # save data
  tmp <- data.frame(ecoRegion_holdout = rep(test_eco,length(y_test)),obs=y_test,
                    pred_opt=optimal_pred, pred_null=null_pred#,
                    #pred_nopenalty=nopen_pred
                    ) %>%
    cbind(testDat_all)
  # put output into a list
  tmpList <- list("testRegion" = i_eco,
    "modelObject" = fit_i,
       "modelPredictions" = tmp)

  # save model outputs
  outList[[which(sort(unique(modDat_1_s$NA_L2NAME)) == i_eco)]] <- tmpList
  # save one example of model fits
  # if(i_eco==sort(unique(modDat_1_s$NA_L2NAME))[1]){
  #   # save model object
  #   saveRDS(fit,file="./ModelResults/glmnet_fit.RDS")
  #
  #   #saveRDS(nopen,file="./ModelResults/nopenalty_fit.RDS")
  # }
 }

#
# # calculate correlations between null and optimal model
# my_cors <- c(cor(outList[[i]]$modelPredictions$pred_opt, outList[[i]]$modelPredictions$obs), # correlation of LOO model's predictions w/ the observations
#             cor(outList[[i]]$modelPredictions$s1, outList[[i]]$modelPredictions$obs)#, # correlations of LOO model's predictions w/ global model's predictions
#             #cor(obs_pred$pred_nopenalty,obs_pred$obs)
#             )
# 
# # calculate mse between null and optimal model
# my_mse <- c(mean((outList[[i]]$modelPredictions$pred_opt - outList[[i]]$modelPredictions$obs)^2) ,
#             mean((outList[[i]]$modelPredictions$s1 - outList[[i]]$modelPredictions$obs)^2)#,
#             #mean((obs_pred$pred_nopenalty - obs_pred$obs)^2)
#             )

# # print results
# names(my_cors) <- names(my_mse) <- c("optimal","null"#,"no penalty"
#                                      )
# print(my_cors)
# print(my_mse)
#
# # save obs vs pred results
# write.csv(obs_pred,"./ModelResults/obs_vs_pred.csv",row.names=F)

for (i in 1:length(unique(modDat_1_s$NA_L2NAME))) {
  holdoutRegion <- outList[[i]]$testRegion
  predictionData <- outList[[i]]$modelPredictions
  modTerms <- as.matrix(coef(outList[[i]]$modelObject)) %>%
    as.data.frame() %>%
    filter(V1!=0) %>%
    rownames()

  # calculate residuals
  predictionData <- predictionData %>%
  mutate(resid = .[["obs"]] - .[["pred_opt"]] ,
         resid_globMod = .[["obs"]]  - .[["pred_null"]])


# rasterize
# use 'test_rast' from earlier

  # rasterize data
plotObs <- predictionData %>%
         drop_na(paste(response)) %>%
  #slice_sample(n = 5e4) %>%
  terra::vect(geom = c("Long", "Lat")) %>%
  terra::set.crs(crs(test_rast)) %>%
  terra::rasterize(y = test_rast,
                   field = "resid",
                   fun = mean) #%>%
  #terra::aggregate(fact = 2, fun = mean, na.rm = TRUE) %>%
  #terra::crop(ext(-1950000, 1000000, -1800000, 1000000))

tempExt <- crds(plotObs, na.rm = TRUE)

plotObs_2 <- plotObs %>% 
  crop(ext(min(tempExt[,1]), max(tempExt[,1]),
           min(tempExt[,2]), max(tempExt[,2])) 
       )

# identify locations where residuals are >100 or < -100
badResids_high <- predictionData %>% 
  filter(resid > 100)  %>% 
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) 
badResids_low <- predictionData %>% 
  filter(resid < -100)  %>% 
  terra::vect(geom = c("Long", "Lat")) %>% 
  terra::set.crs(crs(test_rast)) 


# make figures
# make histogram
hist_i <- ggplot(predictionData) +
  geom_histogram(aes(resid_globMod), col = "darkgrey", fill = "lightgrey") +
  xlab(c("Residuals (obs. - pred.)"))
# make map
map_i <-  ggplot() +
geom_spatraster(data = plotObs_2) +
  geom_sf(data=cropped_states %>% st_transform(crs = st_crs(test_rast)) %>% st_crop(st_bbox(plotObs_2)),fill=NA ) +
  geom_sf(data = badResids_high, col = "blue") +
  geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Residuals (obs. - pred.) for predictions of \n", holdoutRegion, " \n from a model fit to other ecoregions"),
     subtitle = paste0(response, " ~ ", paste0( modTerms, collapse = " + "))) +
  scale_fill_gradient2(low = "red",
                       mid = "white" ,
                       high = "blue" ,
                       midpoint = 0,   na.value = "lightgrey",
                       limits = c(-100, 100))

 assign(paste0("residPlot_",holdoutRegion),
   value = ggarrange(map_i, hist_i, heights = c(3,1), ncol = 1)
)

}


  lapply(unique(modDat_1_s$NA_L2NAME), FUN = function(x) {
    get(paste0("residPlot_", x))
  })
}
## [1] "COLD DESERTS"
## [1] "MEDITERRANEAN PIEDMONT"
## [1] "SEMIARID PLAIN AND PRAIRIES"
## [1] "TEMPERATE PRAIRIES"
## [1] "WARM DESERTS"
## [1] "WEST-CENTRAL SEMIARID PRAIRIES"
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

Save output

# # glm models
# #mods2save <- butcher::butcher(mod_glmFinal) # removes some model components so the saved object isn't huge
# 
# #mods2save$formula <- best_form
# #mods2save$pred_vars_inter <- pred_vars_inter # so have interactions
# #n <- nrow(df_sample)
# #mods2save$data_rows <- n
# 
# 
# if(!test_run) {
#   saveRDS(mods2save, 
#         paste0("./models/glm_beta_model_CONUSwide_", s, "_n", n, 
#         #sample_group, 
#         ".RDS"))
#   if (byRegion == TRUE) {
#     ## western forests
#      saveRDS(mods2save_WF, 
#         paste0("./models/glm_beta_model_WesternForests_", s, "_n", n, 
#         #sample_group, 
#         ".RDS"))
#     ## eastern forests
#      saveRDS(mods2save_EF, 
#         paste0("./models/glm_beta_model_EasternForests_", s, "_n", n, 
#         #sample_group, 
#         ".RDS"))
#      ## grass/shrub
#      saveRDS(mods2save_G, 
#         paste0("./models/glm_beta_model_GrassShrub_", s, "_n", n, 
#         #sample_group, 
#         ".RDS"))
#   }
# }
## partial dependence plots
#vip::vip(mod_glmFinal, num_features = 15)

#pdp_all_vars(mod_glmFinal, mod_vars = pred_vars, ylab = 'probability',train = df_small)

#caret::varImp(fit)

session info

Hash of current commit (i.e. to ID the version of the code used)

system("git rev-parse HEAD", intern=TRUE)
## [1] "79890c55a196d40eb16ae968701c4515b44c260c"

Packages etc.

sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.6.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.6.0     doMC_1.3.8       iterators_1.0.14 foreach_1.5.2    factoextra_1.0.7 glmnet_4.1-8    
##  [7] Matrix_1.7-0     kableExtra_1.4.0 rsample_1.2.1    here_1.0.1       StepBeta_2.1.0   ggtext_0.1.2    
## [13] knitr_1.49       gridExtra_2.3    pdp_0.8.2        GGally_2.2.1     lubridate_1.9.4  forcats_1.0.0   
## [19] stringr_1.5.1    dplyr_1.1.4      purrr_1.0.4      readr_2.1.5      tidyr_1.3.1      tibble_3.2.1    
## [25] tidyverse_2.0.0  caret_6.0-94     lattice_0.22-6   ggplot2_3.5.1    sf_1.0-16        tidyterra_0.6.1 
## [31] terra_1.8-21     ggspatial_1.1.9  dtplyr_1.3.1     patchwork_1.3.0 
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3   rstudioapi_0.17.1    jsonlite_1.9.1       shape_1.4.6.1        magrittr_2.0.3      
##   [6] modeltools_0.2-23    farver_2.1.2         rmarkdown_2.29       vctrs_0.6.5          rstatix_0.7.2       
##  [11] htmltools_0.5.8.1    curl_6.2.1           broom_1.0.7          Formula_1.2-5        pROC_1.18.5         
##  [16] sass_0.4.9           parallelly_1.37.1    KernSmooth_2.23-22   bslib_0.9.0          plyr_1.8.9          
##  [21] sandwich_3.1-0       zoo_1.8-12           cachem_1.1.0         uuid_1.2-1           commonmark_1.9.1    
##  [26] lifecycle_1.0.4      pkgconfig_2.0.3      R6_2.6.1             fastmap_1.2.0        future_1.33.2       
##  [31] digest_0.6.37        colorspace_2.1-1     furrr_0.3.1          rprojroot_2.0.4      pkgload_1.3.4       
##  [36] labeling_0.4.3       timechange_0.3.0     mgcv_1.9-1           httr_1.4.7           abind_1.4-8         
##  [41] compiler_4.4.0       proxy_0.4-27         aod_1.3.3            withr_3.0.2          backports_1.5.0     
##  [46] carData_3.0-5        betareg_3.1-4        DBI_1.2.3            ggstats_0.9.0        ggsignif_0.6.4      
##  [51] MASS_7.3-60.2        lava_1.8.0           rappdirs_0.3.3       classInt_0.4-10      gtools_3.9.5        
##  [56] ModelMetrics_1.2.2.2 tools_4.4.0          units_0.8-5          lmtest_0.9-40        future.apply_1.11.2 
##  [61] nnet_7.3-19          glue_1.8.0           nlme_3.1-164         gridtext_0.1.5       grid_4.4.0          
##  [66] reshape2_1.4.4       generics_0.1.3       recipes_1.1.0        gtable_0.3.6         tzdb_0.4.0          
##  [71] class_7.3-22         data.table_1.17.0    hms_1.1.3            utf8_1.2.4           xml2_1.3.7          
##  [76] car_3.1-2            flexmix_2.3-19       markdown_1.13        ggrepel_0.9.5        pillar_1.10.1       
##  [81] splines_4.4.0        survival_3.5-8       tidyselect_1.2.1     svglite_2.1.3        stats4_4.4.0        
##  [86] xfun_0.51            hardhat_1.4.0        timeDate_4032.109    stringi_1.8.4        yaml_2.3.10         
##  [91] evaluate_1.0.3       codetools_0.2-20     cli_3.6.4            rpart_4.1.23         systemfonts_1.2.1   
##  [96] munsell_0.5.1        jquerylib_0.1.4      Rcpp_1.0.14          globals_0.16.3       gower_1.0.1         
## [101] listenv_0.9.1        tigris_2.1           viridisLite_0.4.2    ipred_0.9-15         scales_1.3.0        
## [106] prodlim_2024.06.25   e1071_1.7-14         crayon_1.5.3         combinat_0.0-8       rlang_1.1.5         
## [111] cowplot_1.1.3